See tokenize on Wiktionary
{
"derived": [
{
"_dis1": "0 0 0",
"word": "detokenize"
},
{
"_dis1": "0 0 0",
"word": "mistokenize"
},
{
"_dis1": "0 0 0",
"word": "retokenize"
},
{
"_dis1": "0 0 0",
"word": "tokenizable"
},
{
"_dis1": "0 0 0",
"word": "tokenization"
},
{
"_dis1": "0 0 0",
"word": "tokenizer"
},
{
"_dis1": "0 0 0",
"word": "untokenized"
}
],
"etymology_templates": [
{
"args": {
"1": "en",
"2": ":af",
"3": "token",
"4": "-ize",
"text": "+",
"tree": "1"
},
"expansion": "Etymology tree\nEnglish token\nProto-Indo-European *-id-\nProto-Indo-European *-yéti\nProto-Indo-European *-idyéti\nProto-Hellenic *-íďďō\nAncient Greek -ῐ́ζω (-ĭ́zō)bor.\nLate Latin -izōder.\nMiddle French -iserbor.\nMiddle English -isen\nEnglish -ize\nEnglish tokenize\n[Appendix:Glossary#loanword|Borrowed]] from\", \"terms\" : [ { \"children\" : [ { \"keyword_abbrev\" : \"der.\", \"keyword_label\" : \"Derived from\", \"terms\" : [ { \"children\" : [ { \"keyword_abbrev\" : \"bor.\", \"keyword_label\" : \"Borrowed from\", \"terms\" : [ { \"id\" : \"verbal\", \"children\" : [ { \"terms\" : [ { \"children\" : [ { \"terms\" : [ { \"children\" : [ { \"terms\" : [ { \"children\" : [ ], \"lang_name\" : \"Proto-Indo-European\", \"term\" : \"*-id-\", \"status\" : \"missing\", \"lang\" : \"ine-pro\" }, { \"children\" : [ ], \"lang_name\" : \"Proto-Indo-European\", \"term\" : \"*-yéti\", \"status\" : \"ok\", \"lang\" : \"ine-pro\" } ], \"keyword_label\" : \"From\", \"is_group\" : true, \"keyword\" : \"affix\" } ], \"lang_name\" : \"Proto-Indo-European\", \"term\" : \"*-idyéti\", \"status\" : \"inline\", \"lang\" : \"ine-pro\" } ], \"keyword_label\" : \"Inherited from\", \"keyword\" : \"inherited\" } ], \"lang_name\" : \"Proto-Hellenic\", \"term\" : \"*-íďďō\", \"status\" : \"inline\", \"lang\" : \"grk-pro\" } ], \"keyword_label\" : \"Inherited from\", \"keyword\" : \"inherited\" } ], \"status\" : \"ok\", \"lang_name\" : \"Ancient Greek\", \"term\" : \"-ῐ́ζω\", \"lang\" : \"grc\" } ], \"keyword\" : \"bor\" } ], \"lang_name\" : \"Late Latin\", \"term\" : \"-izō\", \"status\" : \"ok\", \"lang\" : \"la-lat\" } ], \"keyword\" : \"derived\" } ], \"lang_name\" : \"Middle French\", \"term\" : \"-iser\", \"status\" : \"inline\", \"lang\" : \"frm\" } ], \"keyword\" : \"bor\" } ], \"lang_name\" : \"Middle English\", \"term\" : \"-isen\", \"status\" : \"inline\", \"lang\" : \"enm\" } ], \"keyword_label\" : \"Inherited from\", \"keyword\" : \"inherited\" } ], \"lang_name\" : \"English\", \"term\" : \"-ize\", \"status\" : \"ok\", \"lang\" : \"en\" } ], \"keyword_label\" : \"From\", \"is_group\" : true, \"keyword\" : \"affix\" } ], \"lang_name\" : \"English\", \"term\" : \"tokenize\", \"status\" : \"ok\", \"lang\" : \"en\" }\" data-lang=\"en\" data-title=\"tokenize\">\nFrom token + -ize.",
"name": "ety"
}
],
"etymology_text": "Etymology tree\nEnglish token\nProto-Indo-European *-id-\nProto-Indo-European *-yéti\nProto-Indo-European *-idyéti\nProto-Hellenic *-íďďō\nAncient Greek -ῐ́ζω (-ĭ́zō)bor.\nLate Latin -izōder.\nMiddle French -iserbor.\nMiddle English -isen\nEnglish -ize\nEnglish tokenize\nFrom token + -ize.",
"forms": [
{
"form": "tokenizes",
"tags": [
"present",
"singular",
"third-person"
]
},
{
"form": "tokenizing",
"tags": [
"participle",
"present"
]
},
{
"form": "tokenized",
"tags": [
"participle",
"past"
]
},
{
"form": "tokenized",
"tags": [
"past"
]
},
{
"form": "tokenise",
"tags": [
"alternative"
]
}
],
"head_templates": [
{
"args": {},
"expansion": "tokenize (third-person singular simple present tokenizes, present participle tokenizing, simple past and past participle tokenized)",
"name": "en-verb"
}
],
"lang": "English",
"lang_code": "en",
"pos": "verb",
"related": [
{
"_dis1": "0 0 0",
"word": "tokenism"
}
],
"senses": [
{
"categories": [
{
"kind": "other",
"langcode": "en",
"name": "Computing",
"orig": "en:Computing",
"parents": [],
"source": "w"
},
{
"_dis": "53 30 17",
"kind": "other",
"name": "English entries referencing missing etymons",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "50 29 21",
"kind": "other",
"name": "English entries with etymology texts",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "50 29 21",
"kind": "other",
"name": "English entries with etymology trees",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "57 35 7",
"kind": "other",
"name": "English entries with incorrect language header",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "36 27 37",
"kind": "other",
"name": "English terms suffixed with -ize",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "54 29 17",
"kind": "other",
"name": "Pages using etymon with no ID",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "63 31 6",
"kind": "other",
"name": "Pages with 1 entry",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "64 31 5",
"kind": "other",
"name": "Pages with entries",
"parents": [],
"source": "w+disamb"
},
{
"_dis": "54 30 17",
"kind": "other",
"name": "Pages with etymology trees",
"parents": [],
"source": "w+disamb"
}
],
"coordinate_terms": [
{
"word": "codify"
},
{
"word": "encode"
}
],
"glosses": [
"To reduce to a token or set of tokens by lexical analysis."
],
"id": "en-tokenize-en-verb-jT9kOSc9",
"links": [
[
"computing",
"computing#Noun"
],
[
"token",
"token"
],
[
"lexical analysis",
"lexical analysis"
]
],
"raw_glosses": [
"(transitive, computing) To reduce to a token or set of tokens by lexical analysis."
],
"tags": [
"transitive"
],
"topics": [
"computing",
"engineering",
"mathematics",
"natural-sciences",
"physical-sciences",
"sciences"
]
},
{
"categories": [
{
"kind": "other",
"langcode": "en",
"name": "Computing",
"orig": "en:Computing",
"parents": [],
"source": "w"
},
{
"_dis": "36 27 37",
"kind": "other",
"name": "English terms suffixed with -ize",
"parents": [],
"source": "w+disamb"
}
],
"glosses": [
"To substitute sensitive data with meaningless placeholders."
],
"id": "en-tokenize-en-verb--Kj3NOCS",
"links": [
[
"computing",
"computing#Noun"
]
],
"raw_glosses": [
"(transitive, computing) To substitute sensitive data with meaningless placeholders."
],
"tags": [
"transitive"
],
"topics": [
"computing",
"engineering",
"mathematics",
"natural-sciences",
"physical-sciences",
"sciences"
]
},
{
"categories": [
{
"_dis": "36 27 37",
"kind": "other",
"name": "English terms suffixed with -ize",
"parents": [],
"source": "w+disamb"
}
],
"glosses": [
"To treat as a token minority."
],
"id": "en-tokenize-en-verb-bFvBu9Vi",
"links": [
[
"token",
"token"
],
[
"minority",
"minority"
]
],
"raw_glosses": [
"(transitive) To treat as a token minority."
],
"tags": [
"transitive"
]
}
],
"sounds": [
{
"ipa": "/ˈtoʊ.kən.aɪz/",
"tags": [
"General-American"
]
}
],
"word": "tokenize"
}
{
"categories": [
"English 3-syllable words",
"English entries referencing missing etymons",
"English entries with etymology texts",
"English entries with etymology trees",
"English entries with incorrect language header",
"English lemmas",
"English terms suffixed with -ize",
"English verbs",
"Pages using etymon with no ID",
"Pages with 1 entry",
"Pages with entries",
"Pages with etymology trees"
],
"derived": [
{
"word": "detokenize"
},
{
"word": "mistokenize"
},
{
"word": "retokenize"
},
{
"word": "tokenizable"
},
{
"word": "tokenization"
},
{
"word": "tokenizer"
},
{
"word": "untokenized"
}
],
"etymology_templates": [
{
"args": {
"1": "en",
"2": ":af",
"3": "token",
"4": "-ize",
"text": "+",
"tree": "1"
},
"expansion": "Etymology tree\nEnglish token\nProto-Indo-European *-id-\nProto-Indo-European *-yéti\nProto-Indo-European *-idyéti\nProto-Hellenic *-íďďō\nAncient Greek -ῐ́ζω (-ĭ́zō)bor.\nLate Latin -izōder.\nMiddle French -iserbor.\nMiddle English -isen\nEnglish -ize\nEnglish tokenize\n[Appendix:Glossary#loanword|Borrowed]] from\", \"terms\" : [ { \"children\" : [ { \"keyword_abbrev\" : \"der.\", \"keyword_label\" : \"Derived from\", \"terms\" : [ { \"children\" : [ { \"keyword_abbrev\" : \"bor.\", \"keyword_label\" : \"Borrowed from\", \"terms\" : [ { \"id\" : \"verbal\", \"children\" : [ { \"terms\" : [ { \"children\" : [ { \"terms\" : [ { \"children\" : [ { \"terms\" : [ { \"children\" : [ ], \"lang_name\" : \"Proto-Indo-European\", \"term\" : \"*-id-\", \"status\" : \"missing\", \"lang\" : \"ine-pro\" }, { \"children\" : [ ], \"lang_name\" : \"Proto-Indo-European\", \"term\" : \"*-yéti\", \"status\" : \"ok\", \"lang\" : \"ine-pro\" } ], \"keyword_label\" : \"From\", \"is_group\" : true, \"keyword\" : \"affix\" } ], \"lang_name\" : \"Proto-Indo-European\", \"term\" : \"*-idyéti\", \"status\" : \"inline\", \"lang\" : \"ine-pro\" } ], \"keyword_label\" : \"Inherited from\", \"keyword\" : \"inherited\" } ], \"lang_name\" : \"Proto-Hellenic\", \"term\" : \"*-íďďō\", \"status\" : \"inline\", \"lang\" : \"grk-pro\" } ], \"keyword_label\" : \"Inherited from\", \"keyword\" : \"inherited\" } ], \"status\" : \"ok\", \"lang_name\" : \"Ancient Greek\", \"term\" : \"-ῐ́ζω\", \"lang\" : \"grc\" } ], \"keyword\" : \"bor\" } ], \"lang_name\" : \"Late Latin\", \"term\" : \"-izō\", \"status\" : \"ok\", \"lang\" : \"la-lat\" } ], \"keyword\" : \"derived\" } ], \"lang_name\" : \"Middle French\", \"term\" : \"-iser\", \"status\" : \"inline\", \"lang\" : \"frm\" } ], \"keyword\" : \"bor\" } ], \"lang_name\" : \"Middle English\", \"term\" : \"-isen\", \"status\" : \"inline\", \"lang\" : \"enm\" } ], \"keyword_label\" : \"Inherited from\", \"keyword\" : \"inherited\" } ], \"lang_name\" : \"English\", \"term\" : \"-ize\", \"status\" : \"ok\", \"lang\" : \"en\" } ], \"keyword_label\" : \"From\", \"is_group\" : true, \"keyword\" : \"affix\" } ], \"lang_name\" : \"English\", \"term\" : \"tokenize\", \"status\" : \"ok\", \"lang\" : \"en\" }\" data-lang=\"en\" data-title=\"tokenize\">\nFrom token + -ize.",
"name": "ety"
}
],
"etymology_text": "Etymology tree\nEnglish token\nProto-Indo-European *-id-\nProto-Indo-European *-yéti\nProto-Indo-European *-idyéti\nProto-Hellenic *-íďďō\nAncient Greek -ῐ́ζω (-ĭ́zō)bor.\nLate Latin -izōder.\nMiddle French -iserbor.\nMiddle English -isen\nEnglish -ize\nEnglish tokenize\nFrom token + -ize.",
"forms": [
{
"form": "tokenizes",
"tags": [
"present",
"singular",
"third-person"
]
},
{
"form": "tokenizing",
"tags": [
"participle",
"present"
]
},
{
"form": "tokenized",
"tags": [
"participle",
"past"
]
},
{
"form": "tokenized",
"tags": [
"past"
]
},
{
"form": "tokenise",
"tags": [
"alternative"
]
}
],
"head_templates": [
{
"args": {},
"expansion": "tokenize (third-person singular simple present tokenizes, present participle tokenizing, simple past and past participle tokenized)",
"name": "en-verb"
}
],
"lang": "English",
"lang_code": "en",
"pos": "verb",
"related": [
{
"word": "tokenism"
}
],
"senses": [
{
"categories": [
"English transitive verbs",
"en:Computing"
],
"coordinate_terms": [
{
"word": "codify"
},
{
"word": "encode"
}
],
"glosses": [
"To reduce to a token or set of tokens by lexical analysis."
],
"links": [
[
"computing",
"computing#Noun"
],
[
"token",
"token"
],
[
"lexical analysis",
"lexical analysis"
]
],
"raw_glosses": [
"(transitive, computing) To reduce to a token or set of tokens by lexical analysis."
],
"tags": [
"transitive"
],
"topics": [
"computing",
"engineering",
"mathematics",
"natural-sciences",
"physical-sciences",
"sciences"
]
},
{
"categories": [
"English transitive verbs",
"en:Computing"
],
"glosses": [
"To substitute sensitive data with meaningless placeholders."
],
"links": [
[
"computing",
"computing#Noun"
]
],
"raw_glosses": [
"(transitive, computing) To substitute sensitive data with meaningless placeholders."
],
"tags": [
"transitive"
],
"topics": [
"computing",
"engineering",
"mathematics",
"natural-sciences",
"physical-sciences",
"sciences"
]
},
{
"categories": [
"English transitive verbs"
],
"glosses": [
"To treat as a token minority."
],
"links": [
[
"token",
"token"
],
[
"minority",
"minority"
]
],
"raw_glosses": [
"(transitive) To treat as a token minority."
],
"tags": [
"transitive"
]
}
],
"sounds": [
{
"ipa": "/ˈtoʊ.kən.aɪz/",
"tags": [
"General-American"
]
}
],
"word": "tokenize"
}
Download raw JSONL data for tokenize meaning in All languages combined (5.1kB)
This page is a part of the kaikki.org machine-readable All languages combined dictionary. This dictionary is based on structured data extracted on 2026-06-07 from the enwiktionary dump dated 2026-06-01 using wiktextract (e79dea5 and 7f4db16). The data shown on this site has been post-processed and various details (e.g., extra categories) removed, some information disambiguated, and additional data merged from other sources. See the raw data download page for the unprocessed wiktextract data.
If you use this data in academic research, please cite Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data, Proceedings of the 13th Conference on Language Resources and Evaluation (LREC), pp. 1317-1325, Marseille, 20-25 June 2022. Linking to the relevant page(s) under https://kaikki.org would also be greatly appreciated.